import os
import sys

from zkl_aiutils_datasets import load_dataset
from zkl_llmpt_datasets import TokenizedDataset

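# Make the project root importable so that `scripts.config` can be resolved below.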
project_dir_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../.."))
sys.path.append(project_dir_path)

from scripts.config import datasets_dir_path

dataset_path = os.path.join(datasets_dir_path, "preprocessed/v5.2")

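# Load the preprocessed dataset and verify it is stored in tokenized form.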
dataset = load_dataset(dataset_path)
assert isinstance(dataset, TokenizedDataset)

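# Print a quick summary: number of samples and total token count.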
print(f"samples_n={len(dataset)}")
print(f"tokens_n={dataset.total_tokens_n}")
print()

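# Step through the samples one at a time, showing the raw token ids and the decoded text.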
for sample in dataset:
    sample_text = "".join(dataset.vocab.get_token(token) for token in sample)
    print(sample)
    print(sample_text)
    input("(press enter to continue)")
